

* Load the raw article data. The five cluster files are not distributed
* with this script for copyright reasons.
use "raw articles/cluster1.dta", clear
forvalues k = 2/5 {
	append using "raw articles/cluster`k'.dta"
}

* Cleaning: remove rows with page equal to 2015 and rows whose outlet field
* holds one of the three strings below instead of a newspaper name.
* NOTE(review): these look like parsing errors in the raw export - confirm.
drop if page == 2015 | ///
	outlet == "REPLIK TILL: Bättre att ha jobb att gå till än att få bidrag" | ///
	outlet == `"Reflektioner till HD-artikel "Varje bil behöver tre parkeringsplatser""' | ///
	outlet == "Bröllop Idag gifter vi oss. Marita Björk och Pär Maltesson. Torsby"

* Remove perfect duplicates: first rows that agree on headline, text, date
* and outlet, then any rows that still share an article id.
duplicates drop headline text date outlet, force
duplicates drop id, force


* Harmonise newspaper names. Each call below rewrites one spelling found in
* the article data (first argument) to the target spelling (second argument).
* The merge on outlet further down requires exact string matches with the
* newspaper-month file, so the targets are presumably the spellings used
* there - TODO confirm.
capture program drop std_outlet_name
program define std_outlet_name
	args old_name new_name
	replace outlet = "`new_name'" if outlet == "`old_name'"
end

std_outlet_name "Bärgslagsbladet/Arboga Tidning" "Bärgslagsbladet"
std_outlet_name "Dagens industri" "Dagens Industri"
std_outlet_name "Dala-Demokraten" "Dalademokraten"
std_outlet_name "Dalslänningen" "Bengtsforstidningen Dalslänningen"
std_outlet_name "Enköpings-Posten" "Enköpingsposten"
std_outlet_name "Eskilstuna Kuriren" "Eskilstunakuriren"
std_outlet_name "Fagersta-Posten" "Fagerstaposten"
std_outlet_name "Falu Kuriren" "Falukuriren"
std_outlet_name "Folkbladet" "Folkbladet Norrköping"
std_outlet_name "Göteborgs-Posten" "Göteborgsposten"
std_outlet_name "Tidningen Härjedalen" "Härjedalen"
std_outlet_name "Jönköpings-Posten" "Jönköpingsposten"
std_outlet_name "Kalmar Läns Tidning/Nybro Tidning" "Kalmar Läns Tidning"
std_outlet_name "Karlskoga Tidning -Kuriren" "Karlskoga Tidning"
std_outlet_name "Karlstads Tidningen" "Karlstadstidningen"
std_outlet_name "Katrineholms-Kuriren" "Katrineholmskuriren"
std_outlet_name "Kungsbacka-Posten" "Kungsbackaposten"
std_outlet_name "Kungälvs-Posten" "Kungälvsposten"
std_outlet_name "Ljusdals-Posten" "Ljusdalsposten"
std_outlet_name "Mariestads-Tidningen" "Mariestadstidningen"
std_outlet_name "Motala & Vadstena Tidning" "Motala Vadstena Tidning"
std_outlet_name "Mölndals-Posten" "Mölndalsposten"
std_outlet_name "Norrbottens-Kuriren" "Norrbottenskuriren"
std_outlet_name "Nya Kristinehamns-Posten" "Nya Kristinehamnsposten"
std_outlet_name "Nya Wermlands-Tidningen" "Nya Wermlandstidningen"
std_outlet_name "Nynäshamns-Posten" "Nynäshamnsposten"
std_outlet_name "Piteå-Tidningen" "Piteåtidningen"
std_outlet_name "Smålands-Tidningen" "Smålandstidningen"
std_outlet_name "Strömstads Tidning / Norra Bohuslän" "Strömstads Tidning"
std_outlet_name "Säffle-Tidningen" "Säffletidningen"
std_outlet_name "Söderhamns-Kuriren" "Söderhamnskuriren"
std_outlet_name "TTELA" "Ttela"
std_outlet_name "Upsala Nya Tidning" "Uppsala Nya Tidning"
std_outlet_name "Vetlanda-Posten" "Vetlandaposten"
std_outlet_name "Värmlandsbygden" "Länstidningen Värmlandsbygden"
std_outlet_name "Västerbottens-Kuriren" "Västerbottenskuriren"
std_outlet_name "Västerviks-Tidningen" "Västervikstidningen"
std_outlet_name "Västgöta-Bladet" "Västgötabladet"
std_outlet_name "Växjöbladet/Kronobergaren" "Växjöbladet Kronobergaren"
std_outlet_name "Östersunds-Posten" "Östersundsposten"
std_outlet_name "Östra Småland" "Östran"

* The helper is only needed for this section.
program drop std_outlet_name

* Attach ownership data by newspaper and calendar month. Only articles with
* a matching newspaper-month record are retained.
gen tm = mofd(date)
format tm %tm
merge m:1 outlet tm using "newspapers and owners/newspaper month data.dta", ///
	keep(match) nogenerate

* Content duplication: for every article, count how many OTHER articles
* published on the same date under the same owner share its headline, its
* text, or both (duplicates tag stores the number of additional copies).
duplicates tag date headline owner, gen(dup_headline)
duplicates tag date text owner, gen(dup_text)
duplicates tag date headline text owner, gen(dup_headline_text)

* The placeholder headline (Swedish for "no headline available") is not a
* real headline, so headline-based counts are set to missing for it.
foreach tagvar in dup_headline dup_headline_text {
	replace `tagvar' = . if headline == "Ingen rubrik tillgänglig"
}

* Article length in words, headline plus body text.
gen wordcount = wordcount(headline) + wordcount(text)

* Agency reports: flag articles whose text or headline contains one of the
* TT markers "(TT)", "/TT", "TT-" or "-TT".
gen agency = 0
foreach field in text headline {
	replace agency = 1 if regexm(`field', "\(TT\)") | regexm(`field', "/TT") | ///
	                      regexm(`field', "TT-") | regexm(`field', "-TT")
}

* Keep only what later steps need and park it in a temporary file.
keep id date dup_headline dup_text dup_headline_text wordcount agency outlet
save "wordcount_temp.dta", replace



* Load the human annotations (not included for copyright reasons).
* The path uses a forward slash, which Stata accepts on every platform and
* which matches the other paths in this script; a backslash separator only
* works on Windows.
use "prepare training data/all_ids.dta", clear

* Rows without an annotated article type are not usable.
drop if missing(type)

* Convert each annotation to a string of the form "LABEL_<code>" so that it
* uses the same coding as the machine predictions. A numeric missing value
* becomes "." under tostring and therefore "LABEL_." after prefixing; those
* are reset to the empty string.
foreach var of varlist type topic level context format style geo {
	tostring `var', replace
	replace `var' = "LABEL_" + `var'
	replace `var' = "" if `var' == "LABEL_."
}

* predicted = 0 marks human-classified rows.
keep id type topic level context format style geo
gen predicted = 0
save "annotated temp.dta", replace




* Get the machine predictions.
* For each classified dimension the predictions arrive in two CSV parts.
* Each part is imported, its score column is dropped and its label column
* is renamed after the dimension; both parts are then stacked and stored in
* predictions_<dimension>_temp.dta.
*
* Paths use forward slashes: Stata accepts them on every platform, whereas
* backslashes only work on Windows and can suppress macro expansion when
* they directly precede a macro reference.
*
* geo is deliberately last in the list so that its stacked data remain in
* memory as the master dataset for the merges that follow.
local dims type topic level context format style geo
foreach dim of local dims {
	import delimited "article predictions/predictions_`dim'_1.csv", clear
	drop score
	rename label `dim'
	save "predictions_`dim'_temp.dta", replace

	import delimited "article predictions/predictions_`dim'_2.csv", clear
	drop score
	rename label `dim'
	append using "predictions_`dim'_temp.dta"
	save "predictions_`dim'_temp.dta", replace
}

* Combine all dimensions into one row per article id. No filter is applied
* on the merge result, so ids missing from some dimension are kept with an
* empty value for that dimension.
foreach dim in type topic level context format style {
	merge 1:1 id using "predictions_`dim'_temp.dta", nogenerate
}


* Everything in memory so far is machine-classified; the appended file
* holds the human-annotated rows, which were saved with predicted = 0.
gen predicted = 1
append using "annotated temp.dta"

* Bring in word counts, duplication counts, agency flag, date and outlet.
* Only articles present on both sides are kept.
merge 1:1 id using "wordcount_temp.dta", keep(match) nogenerate

* Remove all intermediate files.
foreach dim in type topic level context format style geo {
	erase "predictions_`dim'_temp.dta"
}
erase "wordcount_temp.dta"
erase "annotated temp.dta"

* Calendar month of publication, and geo category LABEL_2 folded into
* LABEL_1.
gen tm = mofd(date)
replace geo = "LABEL_1" if geo == "LABEL_2"


* Word and article counts, split by journalistic (type LABEL_0) versus
* non-journalistic (type LABEL_1) content.

* Inputs for per-article averages: missing for the opposite type so that
* those articles do not enter the mean.
gen words_avg_journ    = wordcount if type != "LABEL_1"
gen words_avg_nonjourn = wordcount if type != "LABEL_0"

* Inputs for totals: zero for the opposite type so that sums are unaffected.
gen words_tot_journ    = cond(type == "LABEL_1", 0, wordcount)
gen words_tot_nonjourn = cond(type == "LABEL_0", 0, wordcount)

* 0/1 article indicators per type.
gen articles_journ    = (type == "LABEL_0")
gen articles_nonjourn = (type == "LABEL_1")



* Quality variables are only defined for journalistic content, so they are
* blanked out for non-journalistic articles.
foreach strvar of varlist topic level context format style geo {
	replace `strvar' = "" if type == "LABEL_1"
}

foreach numvar of varlist agency dup_headline dup_text dup_headline_text {
	replace `numvar' = . if type == "LABEL_1"
}

	
***** weighting
* Translate each categorical label into a point score. Articles whose label
* is empty or not listed receive a missing score.

* Topic: politics (LABEL_0) = 10; econ/business (LABEL_1) and arts/culture
* (LABEL_2) = 9; sports (LABEL_3) = 4; human interest/other (LABEL_4) = 3.
gen points_topic = cond(topic == "LABEL_0", 10, ///
                   cond(inlist(topic, "LABEL_1", "LABEL_2"), 9, ///
                   cond(topic == "LABEL_3", 4, ///
                   cond(topic == "LABEL_4", 3, .))))

* Level: macro (LABEL_0) = 10; meso (LABEL_1) = 8; micro functional
* (LABEL_2) = 6; micro other (LABEL_3) = 1.
gen points_level = cond(level == "LABEL_0", 10, ///
                   cond(level == "LABEL_1", 8, ///
                   cond(level == "LABEL_2", 6, ///
                   cond(level == "LABEL_3", 1, .))))

* Context: thematic (LABEL_0) = 10; episodic (LABEL_1) = 2.
gen points_context = cond(context == "LABEL_0", 10, ///
                     cond(context == "LABEL_1", 2, .))

* Format: LABEL_1, LABEL_4, LABEL_5 = 10; LABEL_0, LABEL_3 = 9; LABEL_2 = 5.
gen points_format = cond(inlist(format, "LABEL_1", "LABEL_4", "LABEL_5"), 10, ///
                    cond(inlist(format, "LABEL_0", "LABEL_3"), 9, ///
                    cond(format == "LABEL_2", 5, .)))

* Articles flagged as agency reports are always scored and recoded as
* format LABEL_2, whatever format they were originally assigned.
replace points_format = 5 if agency == 1
replace format = "LABEL_2" if agency == 1

* Style: cognitive-normative (LABEL_0) = 10; moral-emotional (LABEL_1) = 2.
gen points_style = cond(style == "LABEL_0", 10, ///
                   cond(style == "LABEL_1", 2, .))


* Composite scores:
*   relevance       = topic points times level points, rescaled by 1/10
*   context overall = 60 percent context points plus 40 percent format points
*   objective       = style points
gen points_relevance = (points_topic*points_level)/10
gen points_context_overall = (points_context*0.6)+(points_format*0.4)
gen points_objective = points_style



********************************************************************************
***** topic diversity
* 0/1 indicators for the eight topic categories that enter the topic
* diversity index. Each is built directly from the logical expression.
gen sports_human    = inlist(topic, "LABEL_3", "LABEL_4")
gen pol_macro_epi   = (topic == "LABEL_0" & level == "LABEL_0" & context == "LABEL_1")
gen pol_macro_them  = (topic == "LABEL_0" & level == "LABEL_0" & context == "LABEL_0")
gen pol_meso        = (topic == "LABEL_0" & level == "LABEL_1")
gen pol_micro       = (topic == "LABEL_0" & inlist(level, "LABEL_2", "LABEL_3"))
gen econ_macro      = (topic == "LABEL_1" & level == "LABEL_0")
gen econ_meso_micro = (topic == "LABEL_1" & inlist(level, "LABEL_1", "LABEL_2", "LABEL_3"))
gen arts_culture    = (topic == "LABEL_2")



***** geo diversity (in contrast to Bachmann et al. 2022, we can include local news)
* geo LABEL_2 does not appear here because it was recoded to LABEL_1 earlier
* in this script.
gen geo_loc     = (geo == "LABEL_0")
gen geo_nat     = (geo == "LABEL_1")
gen geo_foreign = (geo == "LABEL_3")
gen geo_multi   = (geo == "LABEL_4")

* The indicators are undefined for non-journalistic articles.
foreach flag of varlist sports_human pol_macro_epi pol_macro_them pol_meso pol_micro econ_macro econ_meso_micro arts_culture geo_loc geo_nat geo_foreign geo_multi {
	replace `flag' = . if type == "LABEL_1"
}

save "article level quality data.dta", replace


 
* Outlet-month summary over all articles: share of journalistic articles,
* average word counts per journalistic and per non-journalistic article,
* and article and word totals by type.
use "article level quality data.dta", clear
collapse ///
	(mean) share_journ = articles_journ words_avg_journ words_avg_nonjourn ///
	(sum) articles_journ articles_nonjourn words_tot_journ words_tot_nonjourn, ///
	by(outlet tm)

save "share_type_temp.dta", replace



***** Inputs for the Shannon diversity indices, at the newspaper-month level.
* Means are taken over journalistic articles only and are weighted by the
* number of words in each article.

use "article level quality data.dta", clear
keep if type == "LABEL_0"

* One 0/1 dummy per observed category; tabulate numbers them from 1 in the
* sort order of the category values.
foreach dim of varlist topic level context format style {
	quietly tabulate `dim', generate(`dim'_)
}

local category_shares sports_human pol_macro_epi pol_macro_them pol_meso pol_micro econ_macro ///
	econ_meso_micro arts_culture geo_loc geo_nat geo_foreign geo_multi
local score_vars points_relevance points_context_overall points_objective ///
	points_topic points_level points_context points_format points_style
local reuse_vars agency dup_headline dup_text dup_headline_text
local label_dummies topic_* level_* context_* format_* style_*

collapse (mean) `category_shares' `score_vars' `reuse_vars' `label_dummies' ///
	[fw=words_tot_journ], by(outlet tm)



* Add the all-article summary; keep newspaper-months present in both.
merge 1:1 outlet tm using "share_type_temp.dta", keep(match) nogenerate
erase "share_type_temp.dta"


* Category lists for the two diversity indices.
local topic_cats sports_human pol_macro_epi pol_macro_them pol_meso pol_micro econ_macro econ_meso_micro arts_culture
local geo_cats geo_loc geo_nat geo_foreign geo_multi

* Log of each category share. The constant 0.01 keeps the log defined when
* a share is zero.
foreach share of varlist `topic_cats' `geo_cats' {
	gen log_`share' = log(`share'+0.01)
}

* Build the text of the sum over categories of share times log share, in
* list order, for each index.
local topic_sum
local glue
foreach share of local topic_cats {
	local topic_sum `topic_sum'`glue'`share'*log_`share'
	local glue " + "
}

local geo_sum
local glue
foreach share of local geo_cats {
	local geo_sum `geo_sum'`glue'`share'*log_`share'
	local glue " + "
}

* Topic diversity: the sum is divided by log(8), the log of the number of
* topic categories, then squared and multiplied by 10. Squaring also removes
* the negative sign of the raw sum.
gen topic_diversity = (`topic_sum')/log(8)
replace topic_diversity = (topic_diversity^2)*10

* Geo diversity: same construction with four categories.
gen geo_diversity = (`geo_sum')/log(4)
replace geo_diversity = (geo_diversity^2)*10

* Overall diversity: product of the two indices, rescaled by 1/10.
gen diversity = (topic_diversity*geo_diversity)/10


* Overall quality: unweighted mean of the four component scores.
* Newspaper-months without a quality score are dropped.
gen quality = (points_relevance + points_context_overall + points_objective + diversity)/4
drop if missing(quality)


* Attach the newspaper-month covariates; keep matched records only.
merge 1:1 outlet tm using "newspapers and owners/newspaper month data.dta", ///
	keep(match) nogenerate

save "outlet month quality data.dta", replace









*********** cleaning up
* Reduce the article-level file to the article id, the classification
* source and the seven classification variables, then label, sort and
* overwrite the file.
use "article level quality data.dta", clear

keep id type topic level context format style geo predicted
order id predicted type topic level context format style geo

label variable id "Article ID"
* Spelling of "classification" corrected in the label below.
label variable predicted "1 if based on machine classification, 0 if human classification"
label variable type "Article type"
label variable topic "Topic relevance"
label variable level "Actor relevance"
label variable context "Thematic orientation"
label variable format "Interpretative performance"
label variable style "Objectivity"
label variable geo "Geographical reference"

sort id
save "article level quality data.dta", replace









* Final outlet-month file: keep the published variables, put them in their
* final order, label them and overwrite the file.
use "outlet month quality data.dta", clear

* The retained variables and their final order are the same set, so one
* list serves both keep and order.
local final_vars outlet outlet_id owner owner_id tm month year post_merged post ///
 pandemic election_3m subsidy_amount quality points_relevance points_context_overall points_objective diversity ///
 points_topic points_level points_context points_format points_style topic_diversity geo_diversity ///
 topic_1 topic_2 topic_3 topic_4 topic_5 level_1 level_2 level_3 level_4 ///
 context_1 context_2 format_1 format_2 format_3 format_4 format_5 format_6 style_1 style_2 ///
 geo_loc geo_nat geo_foreign geo_multi share_journ articles_journ agency dup_headline_text

keep `final_vars'
order `final_vars'


* Identifiers and newspaper-month covariates.
label variable outlet "Newspaper name"
label variable outlet_id "Newspaper ID"
label variable owner "Owner name"
label variable owner_id "Owner ID"
label variable tm "Year-month"
label variable month "Month"
label variable year "Year"
label variable post_merged "1 if after merger"
* Spelling of "acquisition" corrected in the label below.
label variable post "1 if after acquisition"
label variable pandemic "1 if after February 2020"
label variable election_3m "1 in the month of election and two preceding months"
label variable subsidy_amount "Operational support (in million SEK)"

* Quality scores and their components.
label variable quality "Overall quality score"
label variable points_relevance "Relevance score"
label variable points_context_overall "Contextualization score"
label variable points_objective "Professionalism score"
label variable diversity "Diversity score (Shannon index)"
label variable points_topic "Topic relevance score"
label variable points_level "Actor relevance score"
label variable points_context "Thematic orientation score"
label variable points_format "Interpretative performance score"
label variable points_style "Objectivity score"
label variable topic_diversity "Content diversity"
label variable geo_diversity "Geographical diversity"

* Category shares. The numbered dummies come from tabulate, which numbers
* categories from 1 in sort order, so suffix k corresponds to LABEL_(k-1)
* provided every category occurs in the data.
label variable topic_1 "Share of articles: politics"
label variable topic_2 "Share of articles: economics/business"
label variable topic_3 "Share of articles: arts/culture"
label variable topic_4 "Share of articles: sports"
label variable topic_5 "Share of articles: human interest/other"
label variable level_1 "Share of articles: macro level"
label variable level_2 "Share of articles: meso level"
label variable level_3 "Share of articles: micro level (functional)"
label variable level_4 "Share of articles: micro level (other)"
label variable context_1 "Share of articles: thematic reporting"
label variable context_2 "Share of articles: episodic reporting"
label variable format_1 "Share of articles: news flash"
label variable format_2 "Share of articles: news report"
label variable format_3 "Share of articles: list, agency report"
label variable format_4 "Share of articles: portrait, interview, review"
label variable format_5 "Share of articles: reportage"
label variable format_6 "Share of articles: opinion article"
label variable style_1 "Share of articles: cognitive-normative reporting"
label variable style_2 "Share of articles: moral-emotional reporting"
label variable geo_loc "Share of articles: local/regional"
label variable geo_nat "Share of articles: national/bilateral"
label variable geo_foreign "Share of articles: other country"
label variable geo_multi "Share of articles: international/multilateral"
label variable share_journ "Share of journalistic articles"
label variable articles_journ "Number of journalistic articles"
label variable agency "Share of articles based on news agency"
label variable dup_headline_text "Share of duplicated articles among co-owned papers (weighted by dupl. amount)"

save "outlet month quality data.dta", replace





